{
 "nbformat": 4,
 "nbformat_minor": 2,
 "metadata": {
  "language_info": {
   "name": "python",
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "version": "3.7.6-final"
  },
  "orig_nbformat": 2,
  "file_extension": ".py",
  "mimetype": "text/x-python",
  "name": "python",
  "npconvert_exporter": "python",
  "pygments_lexer": "ipython3",
  "version": 3,
  "kernelspec": {
   "name": "python37664bitwuhan2019ncovpipenv5f20d42ee8464e6b95cd78c3cee9d475",
   "display_name": "Python 3.7.6 64-bit ('Wuhan-2019-nCoV': pipenv)"
  }
 },
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import re\n",
    "import pandas as pd\n",
    "import yaml\n",
    "from metadata import get_china_province_code, get_china_city_code, get_china_area_name\n",
    "\n",
    "def read_report(fp):\n",
    "    with open(fp, \"r\") as f:\n",
    "        return yaml.load(f, Loader=yaml.Loader)\n",
    "\n",
    "def parse_int(text):\n",
    "    if text:\n",
    "        result = re.search(\"\\\\d+\", text)\n",
    "        if result:\n",
    "            return result.group()\n",
    "\n",
    "def parse_list(text, keys):\n",
    "    if text:\n",
    "        result = []\n",
    "        for i in re.finditer(\"([\\u4e00-\\u9fa5]+)(\\\\d+)\", text):\n",
    "            result.append({\n",
    "                keys[0]: i.group(1),\n",
    "                keys[1]: i.group(2)\n",
    "            })\n",
    "        return result\n",
    "\n",
    "def normalize_list(area_list, area_key):\n",
    "    if area_key == \"province\":\n",
    "        for x in area_list:\n",
    "            name = x[area_key]\n",
    "            x[area_key] = get_china_area_name(get_china_province_code(name), name)\n",
    "    else:\n",
    "        for x in area_list:\n",
    "            name = x[area_key]\n",
    "            x[area_key] = get_china_city_code(get_china_province_code(name), name)\n",
    "\n",
    "def parse_report(report):\n",
    "    date = str(report[\"时间\"])\n",
    "    area_key = \"province\"\n",
    "    province = report.get(\"省\")\n",
    "    confirmed = parse_int(report.get(\"确诊\"))\n",
    "    suspected = parse_int(report.get(\"疑似\"))\n",
    "    cured = parse_int(report.get(\"治愈\"))\n",
    "    dead = parse_int(report.get(\"死亡\"))\n",
    "    area_key = \"city\" if province else \"province\"\n",
    "    confirmed_list = parse_list(report.get(\"确诊详情\"), [area_key, \"confirmed\"])\n",
    "    suspected_list = parse_list(report.get(\"疑似详情\"), [area_key, \"suspected\"])\n",
    "    cured_list = parse_list(report.get(\"治愈详情\"), [area_key, \"cured\"])\n",
    "    dead_list = parse_list(report.get(\"死亡详情\"), [area_key, \"dead\"])\n",
    "\n",
    "    data = None\n",
    "    if confirmed or suspected or cured or dead:\n",
    "        data = {\n",
    "            \"confirmed\": confirmed,\n",
    "            \"suspected\": suspected,\n",
    "            \"cured\": cured,\n",
    "            \"dead\": dead\n",
    "        }\n",
    "        if province:\n",
    "            data[\"provinceCode\"] = get_china_province_code(province)\n",
    "            data[\"province\"] = get_china_area_name(data[\"provinceCode\"], province)\n",
    "\n",
    "    for data_list in [confirmed_list, suspected_list, cured_list, dead_list]:\n",
    "        if data_list:\n",
    "            for x in data_list:\n",
    "                if province:\n",
    "                    x[\"provinceCode\"] = get_china_province_code(province)\n",
    "                    x[\"province\"] = get_china_area_name(x[\"provinceCode\"], province)\n",
    "                    x[\"cityCode\"] = get_china_city_code(x[\"provinceCode\"], x[\"city\"])\n",
    "                    x[\"city\"] = get_china_area_name(x[\"cityCode\"], x[\"city\"])\n",
    "                else:\n",
    "                    x[\"provinceCode\"] = get_china_province_code(x[\"province\"])\n",
    "                    x[\"province\"] = get_china_area_name(x[\"provinceCode\"], x[\"province\"])\n",
    "\n",
    "    df_list = [pd.DataFrame(x) for x in [confirmed_list, suspected_list, cured_list, dead_list] if x]\n",
    "    df = None\n",
    "    for index, x in enumerate(df_list):\n",
    "        if df is None:\n",
    "            df = x\n",
    "        else:\n",
    "            df = pd.merge(df, x, on=area_key, how=\"outer\", suffixes=[\"\", f\"\"\"_{index}\"\"\"], sort=False, copy=False)\n",
    "\n",
    "    columns = [\"date\",\"country\",\"countryCode\",\"province\",\"provinceCode\",\"city\",\"cityCode\",\"confirmed\",\"suspected\",\"cured\",\"dead\"]\n",
    "    if df is None:\n",
    "        df = pd.DataFrame([data], columns=columns)\n",
    "    else:\n",
    "        df = pd.DataFrame(df, columns=columns)\n",
    "        df = df.append([data])\n",
    "    df[\"date\"] = date\n",
    "    df[\"country\"] = \"中国\"\n",
    "    df[\"countryCode\"] = \"CN\"\n",
    "    df[\"province\"].fillna(\"\", inplace=True)\n",
    "    df[\"provinceCode\"].fillna(\"\", inplace=True)\n",
    "    df[\"city\"].fillna(\"\", inplace=True)\n",
    "    df[\"cityCode\"].fillna(\"\", inplace=True)\n",
    "    df.sort_values([\"date\", \"countryCode\", \"provinceCode\", \"cityCode\"], inplace=True)\n",
    "    return df\n",
    "\n",
    "for r in os.listdir(\"Report\"):\n",
    "    report = read_report(os.path.join(\"Report\", r))\n",
    "    report_data = parse_report(report)\n",
    "    report_data.to_csv(f\"\"\"ReportData/{report.get(\"时间\")}{report.get(\"省\", \"\")}.csv\"\"\", index=False)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>date</th>\n      <th>country</th>\n      <th>countryCode</th>\n      <th>province</th>\n      <th>provinceCode</th>\n      <th>city</th>\n      <th>cityCode</th>\n      <th>confirmed</th>\n      <th>suspected</th>\n      <th>cured</th>\n      <th>dead</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>2020-01-24</td>\n      <td>中国</td>\n      <td>CN</td>\n      <td>湖北省</td>\n      <td>420000</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>729</td>\n      <td>NaN</td>\n      <td>32.0</td>\n      <td>39.0</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>2020-01-24</td>\n      <td>中国</td>\n      <td>CN</td>\n      <td>湖北省</td>\n      <td>420000</td>\n      <td>武汉市</td>\n      <td>420100.0</td>\n      <td>572</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>38.0</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>2020-01-24</td>\n      <td>中国</td>\n      <td>CN</td>\n      <td>湖北省</td>\n      <td>420000</td>\n      <td>十堰市</td>\n      <td>420300.0</td>\n      <td>5</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>2020-01-24</td>\n      <td>中国</td>\n      <td>CN</td>\n      <td>湖北省</td>\n      <td>420000</td>\n      <td>宜昌市</td>\n      <td>420500.0</td>\n      <td>1</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>1.0</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>2020-01-24</td>\n      <td>中国</td>\n      <td>CN</td>\n      <td>湖北省</td>\n      <td>420000</td>\n      <td>鄂州市</td>\n      <td>420700.0</td>\n      <td>1</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>2020-01-24</td>\n      <td>中国</td>\n      <td>CN</td>\n      <td>湖北省</td>\n      <td>420000</td>\n      <td>荆门市</td>\n      <td>420800.0</td>\n      <td>21</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>6</th>\n      <td>2020-01-24</td>\n      <td>中国</td>\n      <td>CN</td>\n      <td>湖北省</td>\n      <td>420000</td>\n      <td>孝感市</td>\n      <td>420900.0</td>\n      <td>26</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>7</th>\n      <td>2020-01-24</td>\n      <td>中国</td>\n      <td>CN</td>\n      <td>湖北省</td>\n      <td>420000</td>\n      <td>荆州市</td>\n      <td>421000.0</td>\n      <td>10</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>8</th>\n      <td>2020-01-24</td>\n      <td>中国</td>\n      <td>CN</td>\n      <td>湖北省</td>\n      <td>420000</td>\n      <td>黄冈市</td>\n      <td>421100.0</td>\n      <td>64</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>9</th>\n      <td>2020-01-24</td>\n      <td>中国</td>\n      <td>CN</td>\n      <td>湖北省</td>\n      <td>420000</td>\n      <td>随州市</td>\n      <td>421300.0</td>\n      <td>5</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>10</th>\n      <td>2020-01-24</td>\n      <td>中国</td>\n      <td>CN</td>\n      <td>湖北省</td>\n      <td>420000</td>\n      <td>恩施土家族苗族自治州</td>\n      <td>422800.0</td>\n      <td>11</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>11</th>\n      <td>2020-01-24</td>\n      <td>中国</td>\n      <td>CN</td>\n      <td>湖北省</td>\n      <td>420000</td>\n      <td>仙桃市</td>\n      <td>429004.0</td>\n      <td>10</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>12</th>\n      <td>2020-01-24</td>\n      <td>中国</td>\n      <td>CN</td>\n      <td>湖北省</td>\n      <td>420000</td>\n      <td>天门市</td>\n      <td>429006.0</td>\n      <td>3</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n    </tr>\n  </tbody>\n</table>\n</div>",
      "text/plain": "          date country countryCode province  provinceCode        city  \\\n0   2020-01-24      中国          CN      湖北省        420000         NaN   \n1   2020-01-24      中国          CN      湖北省        420000         武汉市   \n2   2020-01-24      中国          CN      湖北省        420000         十堰市   \n3   2020-01-24      中国          CN      湖北省        420000         宜昌市   \n4   2020-01-24      中国          CN      湖北省        420000         鄂州市   \n5   2020-01-24      中国          CN      湖北省        420000         荆门市   \n6   2020-01-24      中国          CN      湖北省        420000         孝感市   \n7   2020-01-24      中国          CN      湖北省        420000         荆州市   \n8   2020-01-24      中国          CN      湖北省        420000         黄冈市   \n9   2020-01-24      中国          CN      湖北省        420000         随州市   \n10  2020-01-24      中国          CN      湖北省        420000  恩施土家族苗族自治州   \n11  2020-01-24      中国          CN      湖北省        420000         仙桃市   \n12  2020-01-24      中国          CN      湖北省        420000         天门市   \n\n    cityCode  confirmed  suspected  cured  dead  \n0        NaN        729        NaN   32.0  39.0  \n1   420100.0        572        NaN    NaN  38.0  \n2   420300.0          5        NaN    NaN   NaN  \n3   420500.0          1        NaN    NaN   1.0  \n4   420700.0          1        NaN    NaN   NaN  \n5   420800.0         21        NaN    NaN   NaN  \n6   420900.0         26        NaN    NaN   NaN  \n7   421000.0         10        NaN    NaN   NaN  \n8   421100.0         64        NaN    NaN   NaN  \n9   421300.0          5        NaN    NaN   NaN  \n10  422800.0         11        NaN    NaN   NaN  \n11  429004.0         10        NaN    NaN   NaN  \n12  429006.0          3        NaN    NaN   NaN  "
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import os\n",
    "import pandas as pd\n",
    "\n",
    "df_list = [pd.read_csv(os.path.join(\"ReportData\", x)) for x in os.listdir(\"ReportData\")]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": "['2020-01-20.csv',\n '2020-01-20湖北省.csv',\n '2020-01-21.csv',\n '2020-01-21湖北省.csv',\n '2020-01-22.csv',\n '2020-01-22湖北省.csv',\n '2020-01-23.csv',\n '2020-01-23湖北省.csv',\n '2020-01-24.csv',\n '2020-01-24湖北省.csv',\n '2020-01-25.csv',\n '2020-01-25湖北省.csv',\n '2020-01-26.csv',\n '2020-01-26湖北省.csv',\n '2020-01-27.csv',\n '2020-01-27湖北省.csv',\n '2020-01湖北省武汉市.csv']"
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import os\n",
    "\n",
    "sorted(os.listdir(\"ReportData\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "csv_file = \"ReportData/2020-01湖北省武汉市.csv\"\n",
    "dtype = {\"date\": str, \"provinceCode\": str, \"cityCode\": str}\n",
    "columns = [\"date\",\"country\",\"countryCode\",\"province\",\"provinceCode\",\"city\",\"cityCode\",\"confirmed\",\"suspected\",\"cured\",\"dead\"]\n",
    "df = pd.read_csv(csv_file, dtype=dtype)\n",
    "df_country = pd.DataFrame(df.groupby([\"date\",\"country\",\"countryCode\"],as_index=False).sum(), columns=columns)\n",
    "df_province = pd.DataFrame(df.groupby([\"date\",\"country\",\"countryCode\",\"province\",\"provinceCode\"],as_index=False).sum(),columns=columns)\n",
    "df = pd.concat([df, df_country, df_province])\n",
    "df[\"country\"].fillna(\"\", inplace=True)\n",
    "df[\"countryCode\"].fillna(\"\", inplace=True)\n",
    "df[\"province\"].fillna(\"\", inplace=True)\n",
    "df[\"provinceCode\"].fillna(\"\", inplace=True)\n",
    "df[\"city\"].fillna(\"\", inplace=True)\n",
    "df[\"cityCode\"].fillna(\"\", inplace=True)\n",
    "df[\"confirmed\"] = df[\"confirmed\"].fillna(0).astype(int)\n",
    "df[\"suspected\"] = df[\"suspected\"].fillna(0).astype(int)\n",
    "df[\"cured\"] = df[\"cured\"].fillna(0).astype(int)\n",
    "df[\"dead\"] = df[\"dead\"].fillna(0).astype(int)\n",
    "df.drop_duplicates(\n",
    "    subset=[\"date\", \"country\", \"province\", \"city\"], inplace=True)\n",
    "df.sort_values([\"date\", \"countryCode\", \"provinceCode\", \"cityCode\", \"city\"], inplace=True)\n",
    "df \n",
    "# df.to_csv(csv_file, index=False, encoding='utf-8')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ]
}